Link to Git repo: https://github.com/nktang05/kibera.git
Link to Analysis: https://nktang05.github.io/kibera/KiberaAnalysis.html
# Read the raw survey export. The working directory is switched to the
# project folder as well, so relative paths used later resolve there.
setwd("~/Desktop/GRIT/Kibera")
data <- fread("~/Desktop/GRIT/Kibera/kibera_values_data.csv", header = TRUE)

# Columns that are not needed downstream: survey-platform metadata plus a
# handful of "_TEXT" columns. They are removed in a single call.
unneeded_cols <- c(
  "StartDate", "EndDate", "Status", "IPAddress", "Progress",
  "Duration (in seconds)", "Finished", "RecordedDate", "ResponseId",
  "RecipientLastName", "RecipientFirstName", "RecipientEmail",
  "ExternalReference", "LocationLatitude", "LocationLongitude",
  "DistributionChannel", "UserLanguage",
  "2.11_7_TEXT", "2.13_7_TEXT", "2.20_5_TEXT", "3.2_8_TEXT",
  "3.16_6_TEXT", "4.21_5_TEXT", "5.1_5_TEXT", "5.12_6_TEXT"
)
# Assigning NULL through set() deletes the named columns.
set(data, j = unneeded_cols, value = NULL)
#summary(data)
# Keep the contents of the first row; it is used further down as the
# variable label text for each column (matched by position).
variable_labels <- as.character(unlist(data[1, ]))
# The first two rows are not survey responses, so remove them.
data <- data[-(1:2), ]
# Prefix every column name that starts with a digit with "x"; names that
# start with anything else are left as they are.
starts_with_digit <- grepl("^[0-9]", names(data))
names(data)[starts_with_digit] <- paste0("x", names(data)[starts_with_digit])
# Columns to store as numbers. Each one is taken through character first and
# then converted; entries that are not valid numbers become NA and trigger a
# coercion warning.
numericVars <- c("x1.1", "x1.2", "x1.3", "x2.1", "x3.1_1_TEXT", "x3.9")
for (num_col in numericVars) {
  as_text <- as.character(data[[num_col]])
  data[[num_col]] <- as.numeric(as_text)
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# make var date
# x1.4 is written as day/month/year. "%Y" also accepts a two-digit year and
# reads it literally (e.g. "19/03/25" becomes year 0025), which is what the
# minimum of 0025-03-19 in the column summary points to. Entries whose year
# part has exactly two digits are therefore parsed with "%y" instead, which
# maps them into the current century range used by strptime().
raw_dates <- trimws(as.character(data$x1.4))
has_short_year <- grepl("^[0-9]{1,2}/[0-9]{1,2}/[0-9]{2}$", raw_dates)
parsed_dates <- as.Date(raw_dates, format = "%d/%m/%Y")
parsed_dates[has_short_year] <- as.Date(
  raw_dates[has_short_year],
  format = "%d/%m/%y"
)
# Anything matching neither layout stays NA, as before.
data$x1.4 <- parsed_dates
# Columns that are kept as plain character strings.
charVars <- c(
  "x2.5_5_TEXT", "x2.7_6_TEXT", "x2.10_8_TEXT", "x2.12_10_TEXT",
  "x2.14_5_TEXT", "x2.16_7_TEXT", "x2.21_9_TEXT", "x3.3_7_TEXT",
  "x3.13_8_TEXT", "x3.14_7_TEXT", "x3.17_7_TEXT", "x4.10_1_TEXT",
  "x7.9_6_TEXT"
)
for (txt_col in charVars) {
  data[[txt_col]] <- as.character(data[[txt_col]])
}
# Every column that is not numeric, not free text and not the x1.4 date is
# converted to a factor.
allVars <- names(data)
excludeVars <- c(numericVars, charVars, "x1.4")
factorVars <- unique(allVars[!(allVars %in% excludeVars)])
for (cat_col in factorVars) {
  data[[cat_col]] <- as.factor(data[[cat_col]])
}
# Show the resulting column names.
names(data)
## [1] "x1.1" "x1.2" "x1.3" "x1.4"
## [5] "x2.1" "x2.2" "x2.3" "x2.4"
## [9] "x2.5" "x2.5_5_TEXT" "x2.6" "x2.7"
## [13] "x2.7_6_TEXT" "x2.8" "x2.9" "x2.10"
## [17] "x2.10_8_TEXT" "x2.11" "x2.12" "x2.12_10_TEXT"
## [21] "x2.13" "x2.14" "x2.14_5_TEXT" "x2.15"
## [25] "x2.16" "x2.16_7_TEXT" "x2.17" "x2.18"
## [29] "x2.19.1" "x2.19.2" "x2.19.3" "x2.20"
## [33] "x2.21" "x2.21_9_TEXT" "x3.1" "x3.1_1_TEXT"
## [37] "x3.2" "x3.3" "x3.3_7_TEXT" "x3.4"
## [41] "x3.5" "x3.6" "x3.7" "x3.8"
## [45] "x3.9" "x3.10" "x3.11" "x3.12"
## [49] "x3.13" "x3.13_8_TEXT" "x3.14" "x3.14_7_TEXT"
## [53] "x3.15" "x3.16" "x3.17" "x3.17_7_TEXT"
## [57] "x3.18" "x3.19" "x3.20" "x4.1"
## [61] "x4.2" "x4.3" "x4.4" "x4.5"
## [65] "x4.6" "x4.7" "x4.8" "x4.9"
## [69] "x4.10" "x4.10_1_TEXT" "x4.11" "x4.12"
## [73] "x4.13" "x4.14" "x4.15" "x4.16"
## [77] "x4.17" "x4.18" "x4.19" "x4.20"
## [81] "x4.21" "x4.22" "x5.23" "x4.24"
## [85] "x4.25" "x4.26" "x4.27" "x4.28"
## [89] "x4.29" "x4.30" "x5.1" "x5.2"
## [93] "x5.3" "x5.4" "x5.5" "x5.6"
## [97] "x5.7" "x5.8" "x5.9" "x5.10"
## [101] "x5.11" "x5.12" "x5.13" "x5.14"
## [105] "x5.15" "x5.16" "x6.1" "x6.2"
## [109] "x6.3" "x6.4" "x7.1" "x7.2"
## [113] "x7.3" "x7.4" "x7.5" "x7.6"
## [117] "x7.7" "x7.8" "x7.9" "x7.9_6_TEXT"
## [121] "x7.10" "x7.11" "x7.12" "x7.13"
## [125] "x7.14" "x7.15" "x7.16" "x7.17"
## [129] "x8.1" "x8.2" "x8.3" "x8.4"
## [133] "x8.5" "x8.6" "x8.7" "x8.8"
## [137] "x8.9" "x9.1" "x9.2" "x9.3"
## [141] "x9.4" "x9.5" "x9.6" "x9.7"
## [145] "x9.8" "x9.9"
# Print a per-column summary of the data after the type conversions
# (numeric, date, character and factor columns).
summary(data)
## x1.1 x1.2 x1.3 x1.4
## Min. : 1.000 Min. : 1.0 Min. : 1.0 Min. :0025-03-19
## 1st Qu.: 2.000 1st Qu.:134.8 1st Qu.: 10.0 1st Qu.:2025-03-19
## Median : 3.000 Median :270.5 Median : 19.0 Median :2025-03-19
## Mean : 3.548 Mean :282.2 Mean : 19.2 Mean :2016-09-11
## 3rd Qu.: 5.000 3rd Qu.:430.2 3rd Qu.: 27.0 3rd Qu.:2025-03-20
## Max. :55.000 Max. :614.0 Max. :303.0 Max. :2025-09-20
## NA's :13 NA's :20 NA's :7 NA's :50
## x2.1 x2.2 x2.3 x2.4 x2.5 x2.5_5_TEXT
## Min. : 8.00 : 9 : 65 :314 :305 Length:520
## 1st Qu.: 16.00 1:340 1:221 7 : 47 1:161 Class :character
## Median : 18.00 2:171 2:234 9 : 37 2: 35 Mode :character
## Mean : 56.03 10 : 35 3: 2
## 3rd Qu.: 20.00 8 : 31 4: 13
## Max. :18415.00 6 : 14 5: 4
## NA's :31 (Other): 42
## x2.6 x2.7 x2.7_6_TEXT x2.8 x2.9 x2.10
## :305 : 13 Length:520 : 21 : 15 6 :256
## 14 : 98 1:445 Class :character 1:200 1:398 7 :111
## 11 : 74 2: 11 Mode :character 2: 37 2: 62 5 : 60
## 7 : 13 3: 38 3: 13 3: 37 1 : 47
## 10 : 9 4: 9 4: 68 4: 7 2 : 23
## 9 : 7 5: 2 5:181 5: 1 : 13
## (Other): 14 6: 2 (Other): 10
## x2.10_8_TEXT x2.11 x2.12 x2.12_10_TEXT x2.13 x2.14
## Length:520 : 15 3 :132 Length:520 : 12 : 17
## Class :character 1: 9 1 :128 Class :character 1: 32 1: 61
## Mode :character 2:398 4 :107 Mode :character 2:452 2:265
## 3: 18 2 : 82 3: 10 3:103
## 4: 53 5 : 28 4: 9 4: 73
## 5: 8 : 14 5: 5 5: 1
## 6: 19 (Other): 29
## x2.14_5_TEXT x2.15 x2.16 x2.16_7_TEXT x2.17
## Length:520 :150 1 :214 Length:520 : 13
## Class :character 5 : 82 2 :208 Class :character 2: 46
## Mode :character 4 : 67 5 : 27 Mode :character 3:258
## 6 : 66 : 21 4:187
## 3 : 45 6 : 18 5: 16
## 7 : 38 3 : 13
## (Other): 72 (Other): 19
## x2.18 x2.19.1 x2.19.2 x2.19.3 x2.20 x2.21 x2.21_9_TEXT
## : 16 : 26 : 55 : 64 : 20 2 :277 Length:520
## 1:440 1 :206 1 : 32 1 :324 1:465 4 :108 Class :character
## 2: 24 1,2: 2 2 :174 1,3: 2 2: 19 3 : 64 Mode :character
## 3: 32 1,3: 3 2,3: 1 2 : 75 3: 4 1 : 34
## 4: 8 2 :141 3 :234 3 : 47 4: 12 : 14
## 3 :134 4 : 24 4 : 8 9 : 13
## 4 : 8 (Other): 10
## x3.1 x3.1_1_TEXT x3.2 x3.3 x3.3_7_TEXT x3.4
## :125 Min. : 9.00 2 :229 :219 Length:520 :224
## 1:297 1st Qu.:13.00 :219 1 :211 Class :character 1: 67
## 2: 98 Median :14.00 1 : 43 5 : 21 Mode :character 2:144
## Mean :14.03 3 : 9 3 : 20 3: 55
## 3rd Qu.:15.00 4 : 9 7 : 15 4: 30
## Max. :20.00 1,2 : 5 2 : 14
## NA's :344 (Other): 6 (Other): 20
## x3.5 x3.6 x3.7 x3.8 x3.9 x3.10 x3.11 x3.12
## :224 :227 :228 :114 Min. : 3.00 :218 :303 :150
## 1:194 1:138 1:123 1:127 1st Qu.:15.00 1: 36 7 : 59 1:270
## 2:102 2:155 2:120 2:251 Median :17.00 2:231 1 : 44 2:100
## 3: 49 3: 28 Mean :16.32 3: 35 6 : 43
## 3rd Qu.:18.00 3 : 34
## Max. :24.00 2 : 18
## NA's :415 (Other): 19
## x3.13 x3.13_8_TEXT x3.14 x3.14_7_TEXT x3.15
## :179 Length:520 :281 Length:520 :136
## 1 : 95 Class :character 1 : 85 Class :character 1:104
## 5 : 55 Mode :character 6 : 75 Mode :character 2:262
## 7 : 46 8 : 33 3: 18
## 4 : 26 2 : 14
## 3 : 25 3 : 13
## (Other): 94 (Other): 19
## x3.16 x3.17 x3.17_7_TEXT x3.18 x3.19 x3.20 x4.1
## :280 :170 Length:520 :127 :143 :126 :252
## 5 :134 5 : 91 Class :character 1:203 1: 53 1: 41 1: 54
## 4 : 35 6 : 47 Mode :character 2:176 2:294 2:310 2:172
## 8 : 26 3 : 38 3: 14 3: 30 3: 43 3: 42
## 1 : 17 8 : 35
## 2 : 17 2 : 32
## (Other): 11 (Other):107
## x4.2 x4.3 x4.4 x4.5 x4.6 x4.7 x4.8 x4.9 x4.10
## :471 :468 :469 :468 :468 :471 :243 :448 :438
## 1: 34 1: 30 1: 15 1: 36 1: 35 1: 21 1: 82 16 : 14 1: 23
## 2: 15 2: 22 2: 36 2: 16 2: 17 2: 28 2:195 18 : 13 2: 59
## 19 : 10
## 20 : 10
## 15 : 7
## (Other): 18
## x4.10_1_TEXT x4.11 x4.12 x4.13 x4.14 x4.15
## Length:520 :460 :442 :452 :438 :454
## Class :character 1 : 28 1: 14 3 months: 6 1: 37 1: 12
## Mode :character 2 : 11 2: 62 1 : 4 2: 40 2: 48
## one : 5 3: 2 7 months: 3 3: 5 3: 6
## 3 : 4 1 month : 2
## 4 : 4 1 yr : 2
## (Other): 8 (Other) : 51
## x4.16 x4.17 x4.18 x4.19 x4.20 x4.21 x4.22 x5.23 x4.24
## :440 :439 :439 :439 :442 :457 :441 :442 :442
## 1: 43 1: 52 1: 50 1: 24 1: 52 1 : 33 1: 70 1: 55 1: 58
## 2: 37 2: 29 2: 31 2: 57 2: 26 4 : 12 2: 9 2: 7 2: 20
## 2 : 7 3: 9
## 3 : 3 4: 1
## 1,2,3 : 2 5: 2
## (Other): 6 6: 4
## x4.25 x4.26 x4.27 x4.28 x4.29 x4.30 x5.1 x5.2 x5.3
## :445 :440 :442 :440 :442 :442 :438 :435 :437
## 1: 53 1: 38 1: 12 1: 13 1: 5 1: 19 1: 61 1: 28 1: 69
## 2: 22 2: 42 2: 45 2: 67 2: 72 2: 57 2: 7 2: 57 2: 14
## 3: 21 3: 1 3: 2 3: 4
## 4: 9
## 5: 1
##
## x5.4 x5.5 x5.6 x5.7 x5.8 x5.9 x5.10 x5.11 x5.12
## :436 :434 :432 :436 :436 :435 :439 :436 :437
## 1: 62 1: 41 1: 76 1: 75 1: 66 1: 81 1: 6 1: 51 2 : 31
## 2: 22 2: 38 2: 12 2: 9 2: 18 2: 4 2: 23 2: 33 1 : 23
## 3: 7 3: 18 4 : 14
## 4: 32 3 : 11
## 5: 2 1,2 : 1
## (Other): 3
## x5.13 x5.14 x5.15 x5.16 x6.1 x6.2 x6.3 x6.4 x7.1
## :436 :436 :437 :432 : 59 : 86 : 68 : 73 : 41
## 1: 33 1: 28 1: 17 1: 56 1:197 1 : 89 1:185 1:282 1:409
## 2: 51 2: 46 2: 41 2: 19 2:251 1,3: 4 2:230 2:165 2: 40
## 3: 10 3: 25 3: 13 3: 13 2 :317 3: 37 3: 30
## 2,3: 1
## 3 : 23
##
## x7.2 x7.3 x7.4 x7.5 x7.6 x7.7 x7.8 x7.9
## : 66 :409 : 48 :302 : 49 : 44 : 42 4 :189
## 1: 49 1: 4 1:126 1: 10 1:343 1: 6 1:256 :141
## 2:405 2: 9 2:313 2: 10 2: 42 2: 81 2:155 1 : 47
## 3: 26 3: 33 3: 46 3: 17 3:167 3: 67 5 : 38
## 4: 18 4: 62 4: 69 4:119 3 : 31
## 5: 54 5: 90 5:103 4,5 : 21
## (Other): 53
## x7.9_6_TEXT x7.10 x7.11 x7.12 x7.13 x7.14 x7.15 x7.16
## Length:520 : 47 : 48 : 42 :220 : 63 : 47 : 56
## Class :character 1:270 1: 16 1: 6 1: 3 1:348 1:367 1:334
## Mode :character 2:203 2: 26 2: 33 2: 17 2: 38 2: 49 2: 86
## 3:208 3:140 3: 63 3: 71 3: 57 3: 44
## 4:222 4:145 4:217
## 5: 8
## 6:146
## x7.17 x8.1 x8.2 x8.3 x8.4 x8.5 x8.6 x8.7 x8.8
## : 63 : 45 : 98 : 30 : 33 : 34 : 31 : 41 : 39
## 1:190 1:208 1:200 1: 57 1: 54 1: 58 1: 20 1:208 1: 41
## 2:198 2:217 2:222 2: 59 2:126 2: 94 2: 29 2:177 2:125
## 3: 69 3: 11 3: 45 3: 70 3: 74 3: 35 3: 62 3: 67
## 4: 39 4:140 4:116 4:162 4:166 4: 15 4:139
## 5:189 5:121 5: 98 5:239 5: 17 5:109
##
## x8.9 x9.1 x9.2 x9.3 x9.4 x9.5 x9.6 x9.7 x9.8
## : 40 : 36 : 37 : 39 : 24 : 25 : 29 : 27 : 28
## 1: 44 1:171 1:172 1:170 1:117 1:169 1:185 1:189 1:219
## 2:101 2:260 2:262 2:246 2:315 2:252 2:233 2:239 2:218
## 3: 63 3: 23 3: 37 3: 39 3: 39 3: 34 3: 40 3: 41 3: 37
## 4:171 4: 30 4: 12 4: 26 4: 25 4: 40 4: 33 4: 24 4: 18
## 5:101
##
## x9.9
## : 25
## 1:313
## 2:145
## 3: 22
## 4: 15
##
##
# Keep only rows whose age (x2.1) lies between 13 and 20 inclusive and whose
# gender (x2.2) is neither NA nor an empty string. A missing age makes the
# age condition NA, and data.table drops rows whose filter value is NA, so
# rows without an age are removed by the same step.
in_age_range <- data[["x2.1"]] >= 13 & data[["x2.1"]] <= 20
gender_given <- !is.na(data[["x2.2"]]) & data[["x2.2"]] != ""
data <- data[in_age_range & gender_given, ]
# CHECK FOR MALE CONDITIONALS
# Throughout this section x2.2 == 2 is the code treated as male. x2.2 itself
# is not modified here, so the mask is computed once.
is_male <- data[["x2.2"]] == 2

# Pregnancy (x4.8): blank a "1" answer given by a male respondent.
data[["x4.8"]][is_male & data[["x4.8"]] == 1] <- NA

# Menstruation (x3.1): blank answers 1 or 2 given by a male respondent.
data[["x3.1"]][is_male & data[["x3.1"]] %in% c("1", "2")] <- NA

# Menstrual age (x3.1_1_TEXT) is numeric, so "has a value" is tested with
# is.na() rather than by comparing against an empty string.
data[["x3.1_1_TEXT"]][is_male & !is.na(data[["x3.1_1_TEXT"]])] <- NA

# Remaining menstruation follow-ups (factors): blank any non-empty answer
# given by a male respondent.
for (menstrual_col in c("x3.2", "x3.3", "x3.4", "x3.5", "x3.6", "x3.7")) {
  data[[menstrual_col]][is_male & data[[menstrual_col]] != ""] <- NA
}

# CHECK FOR SEX ACTIVITY CONDITIONALS
# x3.9 is only kept when x3.8 == 1. The rule must key on x3.8 (the same
# condition the verification query for this rule uses), not on gender:
# filtering on x2.2 here erased x3.9 for every male respondent.
data[["x3.9"]][data[["x3.8"]] != 1 & !is.na(data[["x3.9"]])] <- NA
# List every village number (x1.2) that appears on more than one row,
# together with how many rows carry it.
duplicate_village_sql <- "SELECT [x1.2], COUNT(*) as count
FROM data
GROUP BY [x1.2]
HAVING COUNT(*) > 1"
sqldf(duplicate_village_sql)
## x1.2 count
## 1 NA 18
## 2 1 2
## 3 17 2
## 4 30 3
## 5 202 2
## 6 205 2
## 7 207 2
## 8 208 2
## 9 209 2
## 10 210 2
## 11 211 2
## 12 265 2
## 13 270 2
## 14 271 2
## 15 436 2
## 16 444 2
## 17 451 2
## 18 456 2
## 19 543 3
## 20 607 2
# Verification queries: each one should return zero rows once the
# conditional cleaning has been applied.

# Males (x2.2 = 2) who still report being pregnant (x4.8 = 1).
male_pregnant_sql <- "SELECT [x2.2], [x4.8]
FROM data
WHERE [x2.2] = 2 AND [x4.8] = 1"
sqldf(male_pregnant_sql)
## [1] x2.2 x4.8
## <0 rows> (or 0-length row.names)
# Males with a menstruation answer (x3.1 is 1 or 2).
male_menstruation_sql <- "SELECT [x2.2], [x3.1]
FROM data
WHERE [x2.2] = 2 AND ([x3.1] = 1 OR [x3.1] = 2)"
sqldf(male_menstruation_sql)
## [1] x2.2 x3.1
## <0 rows> (or 0-length row.names)
# Males with a menstruation age (x3.1_1_TEXT) filled in.
male_menstruation_age_sql <- "SELECT [x2.2], [x3.1_1_TEXT]
FROM data
WHERE [x2.2] = 2 AND [x3.1_1_TEXT] != ''"
sqldf(male_menstruation_age_sql)
## [1] x2.2 x3.1_1_TEXT
## <0 rows> (or 0-length row.names)
# Males with a non-empty x3.2 answer.
male_x3_2_sql <- "SELECT [x2.2], [x3.2]
FROM data
WHERE [x2.2] = 2 AND [x3.2] != ''"
sqldf(male_x3_2_sql)
## [1] x2.2 x3.2
## <0 rows> (or 0-length row.names)
# Same check for the 3.3-3.7 follow-ups; only x3.7 is queried here.
male_x3_7_sql <- "SELECT [x2.2], [x3.7]
FROM data
WHERE [x2.2] = 2 AND [x3.7] != ''"
sqldf(male_x3_7_sql)
## [1] x2.2 x3.7
## <0 rows> (or 0-length row.names)
# Sex-activity conditional: rows with x3.8 other than 1 that still have x3.9.
sex_conditional_sql <- "SELECT [x3.8], [x3.9]
FROM data
WHERE [x3.8] != 1 AND [x3.9] != ''"
sqldf(sex_conditional_sql)
## [1] x3.8 x3.9
## <0 rows> (or 0-length row.names)
# Build the variable codebook from the cleaned data and keep the result in
# codebook_output.
# NOTE(review): this runs before the variable labels are attached to the
# columns further down, so the codebook is generated from unlabelled data -
# confirm that is intended.
codebook_output <- codebook(data)
## Warning: There was 1 warning in `dplyr::summarize()`.
## ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
## mangled_skimmers$funs)`.
## ℹ In group 0: .
## Caused by warning:
## ! There were 126 warnings in `dplyr::summarize()`.
## The first warning was:
## ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
## mangled_skimmers$funs)`.
## Caused by warning in `sorted_count()`:
## ! Variable contains value(s) of "" that have been converted to "empty".
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 125 remaining warnings.
## There was 1 warning in `dplyr::summarize()`.
## ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
## mangled_skimmers$funs)`.
## ℹ In group 0: .
## Caused by warning:
## ! There were 126 warnings in `dplyr::summarize()`.
## The first warning was:
## ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
## mangled_skimmers$funs)`.
## Caused by warning in `sorted_count()`:
## ! Variable contains value(s) of "" that have been converted to "empty".
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 125 remaining warnings.
# Attach the label text saved in variable_labels to each column, matching
# labels to columns by position.
for (i in seq_along(data)) {
  var_label(data[[i]]) <- variable_labels[i]
}
#test for success
#var_label(data)

# Turn x2.2 into a value-labelled integer.
# x2.2 is a factor at this point, so as.integer() on it would return the
# internal level codes rather than the recorded answers; converting through
# as.character() first recovers the actual values 1 and 2.
# labelled() builds a new vector, so the variable label attached above is
# passed along explicitly to keep it.
# NOTE(review): x2.2 is handled as gender elsewhere in this file with
# 2 = male; 1 = female is inferred from that - confirm against the
# questionnaire.
x2_2_var_label <- var_label(data$x2.2)
data$x2.2 <- labelled(
  as.integer(as.character(data$x2.2)),
  labels = c("Female" = 1, "Male" = 2),
  label = x2_2_var_label
)
#summary(data)
# Print the age summary of the cleaned data.
summary(data$x2.1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 15.00 18.00 17.12 19.00 20.00
# Export the cleaned data as CSV (relative to the working directory).
cleaned_csv_file <- "kibera_values_cleaned.csv"
fwrite(data, file = cleaned_csv_file)
# Store the data object itself as codebook.rds.
# NOTE(review): this saves `data`, not `codebook_output` - confirm intended.
codebook_rds_file <- "codebook.rds"
saveRDS(data, codebook_rds_file)